# Who Cares if You Vote? Partisan agreement and injunctive norms of voting
# Edward Fieldhouse (1), David Cutts (2), Jack Bailey (1)
# (1) The University of Manchester, (2) The University of Birmingham


# 1. Housekeeping ---------------------------------------------------------

# The following code requires that you have first opened the associated
# project file. If not, click the button on the top right and open it
# ("Open Project...").

# Note also that the Bayesian methods we use below rely on having installed
# the probabilistic programming language Stan. If you have yet to install
# Stan, please see https://mc-stan.org/users/interfaces/.

# Load packages

library(tidyverse)
library(tidybayes)
library(magrittr)
library(haven)
library(jbmisc)   # devtools::install_github("jackobailey/jbmisc)
library(brms)
library(here)


# Load data from the British Election Study Internet Panel. Note: If you
# plan to use this data, please make sure that you download the most recent
# version available at https://www.britishelectionstudy.com.

dta <- read_sav(here("_data", "BES2015_W4_v4.0.sav.zip"))



# 2. Transform data -------------------------------------------------------

# Load data and select variables

dta <-
  dta %>%
  select(
    # Respondent-level covariates
    wt,
    age = Age,
    female = gender,
    edu = edlevel,
    edu2 = education,
    married = marital,
    discuss = discussPolDays,
    att = polAttention,
    no_und = efficacyNotUnderstand,
    pol_care = efficacyPolCare,
    duty = dutyToVote2,
    pid = partyId,
    sq = partyIdSqueeze,
    str = partyIdStrength,
    turn = profile_turnout_2015,
    
    # Discussant-level covariates
    rel = num_range("relationshipName", 1:3),
    app = num_range("discussantApprovalVoteName", 1:3),
    vote = num_range("discussantVoteName", 1:3),
    pol = num_range("discussPolDaysD", 1:3)
  )


# First, we'll deal with all of the respondent-level variables. To get started
# we'll start with age, which we're going to convert to z-scores to help the
# Bayesian models we fit later converge.

dta$age <-
  dta$age %>% 
  mark_na(-8:-9) %>% 
  as.numeric() %>% 
  rescale(type = "z")


# Next, we'll convert the "female" variable to a factor that takes the value 1
# where the respondent is female and 0 where they are male.

dta$female <-
  dta$female %>%
  as_factor()


# We now need to deal with our two education variables. We need "edu2" only as
# it contains information on who has some other qualification and whose are not
# known. We'll add these categories to "edu1", then populate them from "edu2".

dta$edu <-
  dta$edu %>%
  as_factor() %>%
  factor(levels = c("No qualifications",
                    "Below GCSE",
                    "GCSE",
                    "A-level",
                    "Undergraduate",
                    "Postgrad",
                    "Other",
                    "Unknown"),
         labels = c("None",
                    "Below GCSE",
                    "GCSE",
                    "A-level",
                    "Undergraduate",
                    "Postgraduate",
                    "Other",
                    "Unknown"))

dta$edu[dta$edu2 == 18] <- "Other"
dta$edu[is.na(dta$edu) == T] <- "Unknown"


# Now we'll move on to marital status. We want to convert this to a new variable
# that tells us if the respondent is cohabiting. This is because we expect similar
# "marriage" effects for those who are married, living as married, or in civil
# partnerships.

dta$cohabit <-
  dta$married %>%
  as_factor() %>%
  as_dummy(c("Married", "Living as married", "Civil Partnership")) %>%
  factor(labels = c("Other",
                    "Cohabiting"))


# There are now a host of attitudinal variables that we must deal with. These each
# have roughly the same issues. And, in most cases, we can simply mark "Don't know"
# cases as missing, convert them to numeric, and include them in our model. We'll
# do them all at once.

dta$discuss <-
  dta$discuss %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>% 
  subtract(1)


dta$att <-
  dta$att %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>% 
  subtract(1)


dta$no_und <-
  dta$no_und %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>%
  subtract(1)


dta$pol_care <-
  dta$pol_care %>%
  as_factor() %>% 
  mark_na("Don't know") %>%
  as.numeric() %>%
  subtract(1)


dta$duty <- 
  dta$duty %>% 
  as_factor() %>% 
  mark_na("Don't know") %>% 
  as.numeric() %>% 
  subtract(1)


# Now we have to convert the turnout variable to a dummy that takes the value 1
# where the respondent said that they voted and 0 otherwise.

dta$turn <-
  dta$turn %>%
  as_factor() %>% 
  as_dummy("Yes, voted") %>%
  factor(labels = c("Didn't vote", "Voted"))


# Next we have to create a party identification variable that combines the two
# pid variables in our data. First, we use the responses from the squeeze variable
# "sq" to populate our main pid variable where respondents said that they identified
# with no party (10) or did not know (9999). We then convert the resulting variable
# to a factor so that we can include it in our model.

dta$pid[dta$pid %in% c(10, 9999)] <- dta$sq[dta$pid %in% c(10, 9999)]

# Party ID
dta$pid <-
  dta$pid %>%
  as_factor() %>%
  droplevels() %>% 
  factor(labels = c("Conservative",
                    "Labour",
                    "Liberal Democrat",
                    "SNP",
                    "Plaid Cymru",
                    "UKIP",
                    "Green Party",
                    "BNP",
                    "Other",
                    "None",
                    "Don't know"))


# Finally for our respondent-level data, we need to recode the party identification strength
# variable to include those respondents with no party identification, whichb we'll also set
# as the reference category. We'll also mark any cases where the respondent says that they
# "Don't know" how strongly they identified as NA.

dta$str <- 
  dta$str %>% 
  as_factor() %>% 
  mark_na("Don't know") %>% 
  fct_explicit_na("None") %>% 
  fct_relevel("None")



# Next, we must move on to the discussant-level covariates. These are tricky, so the code might
# get quite long below. First, we'll start by converting the dyadic relationship variables to
# factors using the lapply() function.

dta[, paste0("rel", 1:3)] <-
  dta[, paste0("rel", 1:3)] %>%
  lapply(as_factor)


# Now we'll convert the discussants' voting variable so that it takes the same values as the
# respondent party identification variable.

dta[, paste0("vote", 1:3)] <-
  dta[, paste0("vote", 1:3)] %>%
  lapply(as_factor) %>%
  lapply(droplevels) %>% 
  lapply(factor, labels = c("Labour",
                            "Conservative",
                            "Liberal Democrat",
                            "SNP",
                            "Plaid Cymru",
                            "Green Party",
                            "UKIP",
                            "BNP",
                            "Other",
                            "None",
                            "Don't know"))


# Next, we need to create a count of discussants for each respondent. We can do this by adding
# up how many NAs the respondent has across the 3 "rel" columns, then subtracting the answer
# from 3 -- the maximum number of discussants.
dta$n_disc <- 3 - rowSums(is.na(dta[, paste0("rel", 1:3)]) == T)


# Convert to data.frame
dta <- dta %>% data.frame()


# One complicating factor is that some respondents report fewer than 3 discussants, but do not
# necessary put those that they do report in the first slot. Sometimes, for example, respondents
# with 2 discussants will use the 2nd and 3rd slot, or the 1st and 3rd slot. We need to deal with
# this and make sure that where the respondent has fewer than 3 discussants, they always appear in
# the respective column. E.g. a respondent with 1 discussant will have responses in only column 1
# and a respondent with 2 discussants will have responses in only columns 1 and 2. As you can see
# from the output below this is not currently the case. This takes the form of a lot of recoding.

dta[, paste0("rel", 1:3)][dta$n_disc == 2 & is.na(dta$rel1) == T, ] <-
  tibble(rel1 = dta$rel2[dta$n_disc == 2 & is.na(dta$rel1) == T],
         rel2 = dta$rel3[dta$n_disc == 2 & is.na(dta$rel1) == T],
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 2 & is.na(dta$vote1) == T, ] <-
  tibble(vote1 = dta$vote2[dta$n_disc == 2 & is.na(dta$vote1) == T],
         vote2 = dta$vote3[dta$n_disc == 2 & is.na(dta$vote1) == T],
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 2 & is.na(dta$app1) == T, ] <-
  tibble(app1 = dta$app2[dta$n_disc == 2 & is.na(dta$app1) == T],
         app2 = dta$app3[dta$n_disc == 2 & is.na(dta$app1) == T],
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 2 & is.na(dta$rel2) == T, ] <-
  tibble(rel1 = dta$rel1[dta$n_disc == 2 & is.na(dta$rel2) == T],
         rel2 = dta$rel3[dta$n_disc == 2 & is.na(dta$rel2) == T],
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 2 & is.na(dta$vote2) == T, ] <-
  tibble(vote1 = dta$vote1[dta$n_disc == 2 & is.na(dta$vote2) == T],
         vote2 = dta$vote3[dta$n_disc == 2 & is.na(dta$vote2) == T],
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 2 & is.na(dta$app2) == T, ] <-
  tibble(app1 = dta$app1[dta$n_disc == 2 & is.na(dta$app2) == T],
         app2 = dta$app3[dta$n_disc == 2 & is.na(dta$app2) == T],
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel2) == T, ] <-
  tibble(rel1 = dta$rel3[dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel2) == T],
         rel2 = NA,
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote2) == T, ] <-
  tibble(vote1 = dta$vote3[dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote2) == T],
         vote2 = NA,
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app2) == T, ] <-
  tibble(app1 = dta$app3[dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app2) == T],
         app2 = NA,
         app3 = NA)

dta[, paste0("rel", 1:3)][dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel3) == T, ] <-
  tibble(rel1 = dta$rel2[dta$n_disc == 1 & is.na(dta$rel1) == T & is.na(dta$rel3) == T],
         rel2 = NA,
         rel3 = NA)

dta[, paste0("vote", 1:3)][dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote3) == T, ] <-
  tibble(vote1 = dta$vote2[dta$n_disc == 1 & is.na(dta$vote1) == T & is.na(dta$vote3) == T],
         vote2 = NA,
         vote3 = NA)

dta[, paste0("app", 1:3)][dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app3) == T, ] <-
  tibble(app1 = dta$app2[dta$n_disc == 1 & is.na(dta$app1) == T & is.na(dta$app3) == T],
         app2 = NA,
         app3 = NA)


# Because of the way that the multimembership models work, we cannot have any empty columns.
# This is difficult where respondents have only 1 or 2 discussants. Fortunately, this type
# of model does include weights for the multimembership structure. As such, where we have
# missing data, we can repeat discussants across multiple columns but weight them to sum to
# only a single individual. For example, where a respondent has 3 discussants, each column
# has a value and each discussant receives a weight of 1. Where a respondent has only 2
# discussants, we repeat the second instance across the third column then assign each column
# weights of 1, .5, and .5, which sum to 2 -- the number of discussants.

dta[, c("rel3", "vote3", "app3")][dta$n_disc == 2, ] <- 
  dta[, c("rel2", "vote2", "app2")][dta$n_disc == 2, ]

dta[, c("rel2", "vote2", "app2")][dta$n_disc == 1, ] <-
  dta[, c("rel1", "vote1", "app1")][dta$n_disc == 1, ]

dta[, c("rel3", "vote3", "app3")][dta$n_disc == 1, ] <-
  dta[, c("rel1", "vote1", "app1")][dta$n_disc == 1, ]


# Now we'll create the weighting variables. As well as count *how many* discussants each
# respondent has, these also account for how often they discussed politics with them. We
# use the "pol" variables to do this, then create the weights where respondents have 3,
# 2, and 1 discussants.

dta[, paste0("w", 1:3)] <- NA

dta[, paste0("pol", 1:3)] <- 
  dta[, paste0("pol", 1:3)] %>% 
  lapply(as_factor) %>% 
  lapply(mark_na, "Don't know") %>% 
  lapply(as.numeric) %>% 
  lapply(subtract, 1) %>% 
  lapply(divide_by, 7)

dta[, paste0("w", 1:3)][dta$n_disc == 3, ] <- 
  dta[, paste0("pol", 1:3)][dta$n_disc == 3, ]

dta[, paste0("w", 1:3)][dta$n_disc == 2, ] <- 
  tibble(w1 = dta$pol1[dta$n_disc == 2],
         w2 = dta$pol2[dta$n_disc == 2]/2,
         w3 = dta$pol2[dta$n_disc == 2]/2)

dta[, paste0("w", 1:3)][dta$n_disc == 1, ] <-
  tibble(w1 = dta$pol1[dta$n_disc == 1]/3,
         w2 = dta$pol1[dta$n_disc == 1]/3,
         w3 = dta$pol1[dta$n_disc == 1]/3)


# Finally, we need to recode our main independent variable -- discussant approval.
# This is a little bit tricky too. First, we'll recode these variables into dummy
# variables that take the value 1 where the respondent said that the discussant
# would care if they voted and 0 otherwise. We will include the original "app"
# variables at the 2nd level, but need to create a new variable to capture the
# main effect at the individual level while accounting for the number of discussants
# that each respondent has. We do this by calculating the weighted average of the
# "app" variables.

dta[, paste0("app", 1:3)] <-
  dta[, paste0("app", 1:3)] %>%
  lapply(as_factor) %>% 
  lapply(as_dummy, "Yes")

dta <-
  dta %>% 
  mutate(
    app_fix1 = app1*w1,
    app_fix2 = app2*w2,
    app_fix3 = app3*w3
  )

dta$app_fix <- rowMeans(dta[, paste0("app_fix", 1:3)])


# Another tricky bit, we need to create new variables that tell us about the nature of
# each respondents-discussant dyad's partisanship. We can do this using the case_when()
# function.

opt <- c("None", "Don't know")

dta <-
  dta %>%
  mutate(
    prt1 = 
      case_when(
        pid == vote1 & !(pid %in% opt) & !(vote1 %in% opt) & pid != "Other" & vote1 != "Other" & is.na(rel1) == F ~ "Shared Partisanship",
        pid != vote1 & !(pid %in% opt) & !(vote1 %in% opt) & is.na(rel1) == F ~ "Opposing Partisanship",
        pid == "Other" & vote1 == "Other" & is.na(rel1) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote1 %in% opt) & is.na(rel1) == F ~ "Respondent Non-partisan",
        vote1 %in% opt & !(pid %in% opt) & is.na(rel1) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote1 %in% opt & is.na(rel1) == F ~ "Shared Non-partisan"),
    prt2 = 
      case_when(
        pid == vote2 & !(pid %in% opt) & !(vote2 %in% opt) & pid != "Other" & vote2 != "Other" & is.na(rel2) == F ~ "Shared Partisanship",
        pid != vote2 & !(pid %in% opt) & !(vote2 %in% opt) & is.na(rel2) == F ~ "Opposing Partisanship",
        pid == "Other" & vote2 == "Other" & is.na(rel2) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote2 %in% opt) & is.na(rel2) == F ~ "Respondent Non-partisan",
        vote2 %in% opt & !(pid %in% opt) & is.na(rel2) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote2 %in% opt & is.na(rel2) == F ~ "Shared Non-partisan"),
    prt3 =
      case_when(
        pid == vote3 & !(pid %in% opt) & !(vote3 %in% opt) & pid != "Other" & vote3 != "Other" & is.na(rel3) == F ~ "Shared Partisanship",
        pid != vote3 & !(pid %in% opt) & !(vote3 %in% opt) & is.na(rel3) == F ~ "Opposing Partisanship",
        pid == "Other" & vote3 == "Other" & is.na(rel3) == F ~ "Opposing Partisanship",
        pid %in% opt & !(vote3 %in% opt) & is.na(rel3) == F ~ "Respondent Non-partisan",
        vote3 %in% opt & !(pid %in% opt) & is.na(rel3) == F ~ "Discussant Non-partisan",
        pid %in% opt & vote3 %in% opt & is.na(rel3) == F ~ "Shared Non-partisan")
  )


# Finally, we'll select only the variables we need for our analysis, then remove any missing
# data using listwise deletion.

dta <-
  dta %>%
  select(wt,
         age,
         female,
         edu,
         cohabit,
         discuss,
         att,
         no_und,
         pol_care,
         duty,
         str,
         turn,
         app_fix,
         num_range("rel", 1:3),
         num_range("app", 1:3),
         num_range("prt", 1:3),
         num_range("w", 1:3)) %>% 
  na.omit()



# 3. Create DPID triptych plot --------------------------------------------

dta1 <- 
  dta %>% 
  select(
    wt,
    app_ = num_range("app", 1:3),
    prt_ = num_range("prt", 1:3)
  ) %>% 
  reshape(.,
          varying = paste0(rep(c("app_", "prt_"), each = 3), 1:3),
          sep = "_",
          direction = "long") %>% 
  na.omit() %>% 
  group_by(prt, app) %>%
  summarize(n = sum(wt)) %>%
  spread(app, n) %>% 
  ungroup() %>% 
  janitor::adorn_percentages(denominator = "row") %>% 
  pivot_longer(cols = c(`0`, `1`),
               names_to = "app",
               values_to = "pct") %>% 
  mutate(
    app = app %>% factor(labels = c("Do not approve", "Approve")),
    prt =
      prt %>% 
      factor(
        levels =
          c(
            "Shared Non-partisan",
            "Discussant Non-partisan",
            "Respondent Non-partisan",
            "Opposing Partisanship",
            "Shared Partisanship"    
          ),
        labels = 
          c(
            "Shared\nNonpartisan",
            "Discussant\nNonpartisan",
            "Respondent\nNonpartisan",
            "Opposing\nPartisanship",
            "Shared\nPartisanship"   
          )
      )
  )

plot1 <- 
  dta1 %>% 
  ggplot(
    aes(
      y = pct,
      x = prt,
      colour = app,
      fill = app,
      label = paste0(format(round(pct, 3)*100, nsmall = 1), "%")
    )
  ) +
  geom_col(position = position_dodge(width = .9),
           show.legend = T,
           width = .7) +
  geom_text(color = "black",
            size = 1.5,
            family = "Cabin",
            position = position_dodge(width = .9),
            vjust = -0.75) +
  labs(subtitle = "Dyadic Party Identification") +
  scale_y_continuous(limits = c(0, 1),
                     labels = scales::percent_format(accuracy = 1)) +
  scale_colour_manual(values = bailey_colours("blue", "blue5")) +
  scale_fill_manual(values = bailey_colours("blue5", "blue10")) +
  theme_bailey() +
  theme(legend.position = "bottom",
        legend.box.margin = margin(-6, -10, -2, -10),
        legend.key.size = unit(.3, "cm"),
        legend.title = element_blank(),
        legend.text = element_text(size = rel(.6), hjust = .5),
        plot.subtitle = element_text(face = "bold"),
        axis.title.x = element_blank(),
        axis.title.y = element_blank(),
        axis.text.y = element_text(size = rel(.8), hjust = 0.5),
        axis.text.x = element_text(size = rel(.8)))


# And, finally, we'll export this plot as a jpeg

jpeg(file = here("_output", "app_w4.jpeg"),
     res = 300,
     width = 2.99,
     height = 2.2,
     units = "in")
plot1
dev.off()
